Load libraries

# Load libraries, install if needed
library(tidyverse); theme_set(theme_classic())
library(readxl)
library(tidylog)
library(RCurl)
library(sp)
library(geosphere)
library(viridis)
library(RColorBrewer)
library(patchwork)
library(janitor)
library(icesDatras)
library(mapdata)
library(patchwork)
library(rgdal)
library(raster)
library(sf)
library(rgeos)
library(chron)
library(lattice)
library(ncdf4)
library(sdmTMB) # remotes::install_github("pbs-assess/sdmTMB")
library(marmap)
library(rnaturalearth)
library(rnaturalearthdata)
library(mapplots)
library(mgcv)
library(qwraps2) # To load entire cache in interactive r session, do: qwraps2::lazyload_cache_dir(path = "R/spatio_temporal_indices/html")

# For adding maps to plots
world <- ne_countries(scale = "medium", returnclass = "sf")

# Specify map ranges
ymin = 55.5; ymax = 58; xmin = 12.5; xmax = 20

Saduria

Read data

# sad <- readr::read_csv("https://raw.githubusercontent.com/maxlindmark/bentfish/main/data/for_analysis/saduria.csv") %>%
#   dplyr::select(-X1)
sad <- read.csv("data/for_analysis/saduria.csv") %>% dplyr::select(-X)

sad <- sad %>%
  filter(lat < ymax & lat > ymin & lon > xmin & lon < xmax) %>% 
  filter(year > 2014) %>% 
  mutate(year_f = as.factor(year_f),
         depth_scaled_sq = depth_scaled*depth_scaled)

hist(sad$biomass)


# Plot depth
p1 <- ggplot(sad, aes(depth)) +
  geom_histogram() +
  theme_classic(base_size = 18) +
  ggtitle("all samples")

p2 <- filter(sad, biomass > 0) %>%
  ggplot(., aes(depth)) +
  geom_histogram() +
  theme_classic(base_size = 18) +
  ggtitle("positive biomass")

p1/p2


ggsave("figures/saduria_depth.png", width = 9, height = 9, dpi = 600)

Read and crop pred grid to match saduria

## Read the prediction grid
# pred_grid <- readr::read_csv("https://raw.githubusercontent.com/maxlindmark/bentfish/main/data/for_analysis/pred_grid.csv")
pred_grid <- read.csv("data/for_analysis/pred_grid.csv")

pred_grid <- pred_grid %>%
  mutate(depth_scaled = (depth - mean(sad$depth))/sd(sad$depth)) %>% 
  mutate(depth_scaled_sq = depth_scaled*depth_scaled) %>% 
  mutate(year_f = factor(year),
         year = as.integer(year)) %>% 
  filter(year > 2014) 

# Subset the prediction grid to match saduria data
pred_grid_sad <- pred_grid

tf_sad <- exclude.too.far(pred_grid_sad$lon, pred_grid_sad$lat, sad$lon, sad$lat, 0.01) # 0.03 seems reasonable

# Filter the grid points that are not too far from the data
pred_grid_sad$too_far <- tf_sad

# Plot again
pred_grid_sad %>%
  filter(too_far == FALSE) %>%
  ggplot(., aes(lon, lat, fill = factor(sub_area))) +
  geom_raster() +
  geom_point(data = sad, aes(lon, lat), color = "black", size = 0.5) +
  NULL


pred_grid_sad <- pred_grid_sad %>% filter(too_far == FALSE)

Fit models using sdmTMB assuming a Tweedie distribution and biomass as the response.

Make spde mesh

# Non-island version
sad_spde <- make_mesh(data = sad, xy_cols = c("lon", "lat"), n_knots = 80, type = "kmeans", seed = 42)

Fit the model

m_sad <- sdmTMB(formula = biomass ~ year_f - 1 + depth_scaled + depth_scaled_sq, data = sad,
                time = "year", spde = sad_spde, family = tweedie(link = "log"),
                ar1_fields = TRUE, include_spatial = FALSE, spatial_trend = FALSE, spatial_only = FALSE)

# Check model
print(m_sad)
#> Spatiotemporal model fit by ML ['sdmTMB']
#> Formula: biomass ~ year_f - 1 + depth_scaled + depth_scaled_sq
#> Time column: "year"
#> SPDE: sad_spde
#> Data: sad
#> Family: tweedie(link = 'log')
#>                 coef.est coef.se
#> year_f2015         -1.38    0.76
#> year_f2016         -2.07    0.75
#> year_f2017         -2.89    0.81
#> year_f2018         -1.37    0.70
#> year_f2019         -2.09    0.80
#> depth_scaled        0.46    0.23
#> depth_scaled_sq    -0.26    0.12
#> 
#> Matern range parameter: 0.27
#> Dispersion parameter: 7.36
#> Spatiotemporal SD (sigma_E): 3.32
#> Spatiotemporal AR1 correlation (rho): 0.91
#> ML criterion at convergence: 797.285

# Check AR1 parameter
tidy(m_sad, effects = "ran_pars", conf.int = TRUE) %>% filter(term == "rho")
#> filter: removed 3 rows (75%), one row remaining
#>   term  estimate std.error  conf.low conf.high
#> 1  rho 0.9139087 0.7003771 0.6985128 0.9774585
# Quite large

# Check residuals
d2_sad <- sad
d2_sad$residuals <- residuals(m_sad)

# Pretty good!
qqnorm(d2_sad$residuals); abline(a = 0, b = 1)


# Lastly, plot residuals on a map
ggplot(d2_sad, aes(lon, lat, colour = residuals)) +
  geom_point(size = 0.5) +
  facet_wrap(~year) +
  scale_color_gradient2()


# Plot the marginal effect of depth:
nd_sad <- data.frame(depth_scaled = seq(min(sad$depth_scaled), max(sad$depth_scaled), length.out = 100))
nd_sad$depth_scaled_sq <- nd_sad$depth_scaled*nd_sad$depth_scaled
nd_sad$year <- as.integer(max(sad$year))
nd_sad$year_f <- factor(nd_sad$year)

p_sad <- predict(m_sad, newdata = nd_sad, se_fit = TRUE, re_form = NA)

ggplot(p_sad, aes(depth_scaled, exp(est),
  ymin = exp(est - 1.96 * est_se), ymax = exp(est + 1.96 * est_se))) +
  geom_line() + geom_ribbon(alpha = 0.4)

Make prediction onto the new grid

# Predict using the Tweedie model
sad_pred_grid <- predict(m_sad, newdata = pred_grid_sad)

# Plot predictions!
sad_pred_grid %>% 
  ggplot(., aes(lon, lat, fill = est)) +
  geom_raster() +
  facet_wrap(~year, ncol = 2) +
  scale_fill_viridis(option = "magma", na.value = "transparent") + 
  geom_sf(data = world, inherit.aes = F, size = 0.2) +
  coord_sf(xlim = c(xmin, xmax), ylim = c(ymin, ymax)) +
  ggtitle("Prediction (random + fixed)")
#> Warning: Raster pixels are placed at uneven horizontal intervals and will be
#> shifted. Consider using geom_tile() instead.
#> Warning: Raster pixels are placed at uneven vertical intervals and will be
#> shifted. Consider using geom_tile() instead.


# Plot spatiotemporal random effects
sad_pred_grid %>% 
  ggplot(., aes(lon, lat, fill = epsilon_st)) +
  geom_raster() +
  facet_wrap(~year, ncol = 2) +
  scale_fill_gradient2(na.value = "transparent") +
  geom_sf(data = world, inherit.aes = F, size = 0.2) +
  coord_sf(xlim = c(xmin, xmax), ylim = c(ymin, ymax)) + 
  ggtitle("Spatiotemporal random effect")
#> Warning: Raster pixels are placed at uneven horizontal intervals and will be
#> shifted. Consider using geom_tile() instead.

#> Warning: Raster pixels are placed at uneven vertical intervals and will be
#> shifted. Consider using geom_tile() instead.

Now calculate the index by year and sub area

# From these models, predict annual biomass for each sub area
# sort(unique(sad$sub_area))

# Sub-area 25
sad_preds25 <- predict(m_sad, filter(pred_grid_sad, sub_area == 2),
                       return_tmb_object = TRUE) # Predict using the grid for subarea

sad_ind25 <- get_index(sad_preds25, bias_correct = T) # Get the index (sum of all cells in the pred grid)

# The index is now the sum of grid cells in g/area of a grid cell
# First multiply by the area of a grid cell to get raw g
# Then divide that by the total area
ncells25 <- pred_grid_sad %>% filter(year == 2015 & sub_area == 2) %>% nrow()

# This is the same as simply dividing by ncells2
sad_ind25 <- sad_ind25 %>% mutate(est = est / ncells25,
                                  lwr = lwr / ncells25,
                                  upr = upr / ncells25,
                                  sub_area = 2)

# Sub-area 27
sad_preds27 <- predict(m_sad, filter(pred_grid_sad, sub_area == 4),
                       return_tmb_object = TRUE) # Predict using the grid for subarea

sad_ind27 <- get_index(sad_preds27, bias_correct = T) # Get the index (sum of all cells in the pred grid)

# The index is now the sum of grid cells in g/area of a grid cell
# First multiply by the area of a grid cell to get raw g
# Then divide that by the total area
ncells27 <- pred_grid_sad %>% filter(year == 2015 & sub_area == 4) %>% nrow()

# This is the same as simply dividing by ncells2
sad_ind27 <- sad_ind27 %>% mutate(est = est / ncells27,
                                  lwr = lwr / ncells27,
                                  upr = upr / ncells27,
                                  sub_area = 4)
 
# Sub-area 28
sad_preds28 <- predict(m_sad, filter(pred_grid_sad, sub_area == 5),
                       return_tmb_object = TRUE) # Predict using the grid for subarea

sad_ind28 <- get_index(sad_preds28, bias_correct = T) # Get the index (sum of all cells in the pred grid)

# The index is now the sum of grid cells in g/area of a grid cell
# First multiply by the area of a grid cell to get raw g
# Then divide that by the total area
ncells28 <- pred_grid_sad %>% filter(year == 2015 & sub_area == 5) %>% nrow()

# This is the same as simply dividing by ncells2
sad_ind28 <- sad_ind28 %>% mutate(est = est / ncells28,
                                  lwr = lwr / ncells28,
                                  upr = upr / ncells28,
                                  sub_area = 5)

# Merge prediction-data
sad_preds <- bind_rows(sad_ind25, sad_ind27, sad_ind28)

# Compare to data quickly
p <- sad_preds %>%
  dplyr::select(year, est, upr, lwr, sub_area) %>%
  mutate(source = "pred") %>% 
  arrange(year, sub_area)

d <- sad %>%
  filter(!sub_area == 24) %>% 
  group_by(year, sub_area) %>% 
  summarise(est = mean(biomass)) %>%
  mutate(source = "data") %>% 
  arrange(year, sub_area)
  
ggplot(bind_rows(p, d), aes(year, est, color = source)) +
  geom_point(size = 4) + 
  geom_line(size = 1) + 
  facet_wrap(~ sub_area, scales = "free", ncol = 2) +
  geom_ribbon(aes(ymin = lwr, ymax = upr), alpha = 0.4) + 
  theme_classic(base_size = 15) + 
  ggtitle("Saduria")
#> Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning -
#> Inf

#> Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning -
#> Inf

#> Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning -
#> Inf

Amphipoda

Read data

# amp <- readr::read_csv("https://raw.githubusercontent.com/maxlindmark/bentfish/main/data/for_analysis/amphipoda.csv") %>%
#   dplyr::select(-X1)
amp <- read.csv("data/for_analysis/amphipoda.csv") %>% dplyr::select(-X)

amp <- amp %>%
  filter(lat < ymax & lat > ymin & lon > xmin & lon < xmax) %>% 
  filter(year > 2014) %>% 
  mutate(year_f = as.factor(year_f),
         depth_scaled_sq = depth_scaled*depth_scaled)

unique(is.na(amp))
#>       year month   day   lon   lat species_group depth prov_nr provID sub_area
#> [1,] FALSE FALSE FALSE FALSE FALSE         FALSE FALSE   FALSE  FALSE    FALSE
#>       taxa sample_id depth_scaled year_f biomass abundance biomass_g_m2
#> [1,] FALSE     FALSE        FALSE  FALSE   FALSE     FALSE        FALSE
#>      biomass_g_km2 biomass_kg_km2 depth_scaled_sq
#> [1,]         FALSE          FALSE           FALSE

nrow(amp)
#> [1] 841

hist(amp$biomass)


# Plot depth
p1 <- ggplot(amp, aes(depth)) +
  geom_histogram() +
  theme_classic(base_size = 18) +
  ggtitle("all samples")

p2 <- filter(amp, biomass > 0) %>%
  ggplot(., aes(depth)) +
  geom_histogram() +
  theme_classic(base_size = 18) +
  ggtitle("positive biomass")

p1/p2


ggsave("figures/amphipoda_depth.png", width = 9, height = 9, dpi = 600)

Read and crop pred grid to match amphipoda

## Read the prediction grid
# pred_grid <- readr::read_csv("https://raw.githubusercontent.com/maxlindmark/bentfish/main/data/for_analysis/pred_grid.csv")
pred_grid <- read.csv("data/for_analysis/pred_grid.csv")

pred_grid <- pred_grid %>%
  mutate(depth_scaled = (depth - mean(amp$depth))/sd(amp$depth)) %>% 
  mutate(depth_scaled_sq = depth_scaled*depth_scaled) %>% 
  mutate(year_f = factor(year),
         year = as.integer(year)) %>% 
  filter(year > 2014) 

# Subset the prediction grid to match amphipoda data
pred_grid_amp <- pred_grid

tf_amp <- exclude.too.far(pred_grid_amp$lon, pred_grid_amp$lat, amp$lon, amp$lat, 0.01) # 0.03 seems reasonable

# Filter the grid points that are not too far from the data
pred_grid_amp$too_far <- tf_amp

# Plot again
pred_grid_amp %>%
  filter(too_far == FALSE) %>%
  ggplot(., aes(lon, lat, fill = factor(sub_area))) +
  geom_raster() +
  geom_point(data = amp, aes(lon, lat), color = "black", size = 0.5) +
  NULL


pred_grid_amp <- pred_grid_amp %>% filter(too_far == FALSE)

Fit models using sdmTMB assuming a Tweedie distribution and biomass as the response.

Make spde mesh

# Non-island version
amp_spde <- make_mesh(data = amp, xy_cols = c("lon", "lat"), n_knots = 80, type = "kmeans", seed = 42)

Fit the model

m_amp <- sdmTMB(formula = biomass ~ year_f - 1 + depth_scaled + depth_scaled_sq, data = amp,
                time = "year", spde = amp_spde, family = tweedie(link = "log"),
                ar1_fields = TRUE, include_spatial = FALSE, spatial_trend = FALSE, spatial_only = FALSE)

# Check model
print(m_amp)
#> Spatiotemporal model fit by ML ['sdmTMB']
#> Formula: biomass ~ year_f - 1 + depth_scaled + depth_scaled_sq
#> Time column: "year"
#> SPDE: amp_spde
#> Data: amp
#> Family: tweedie(link = 'log')
#>                 coef.est coef.se
#> year_f2015         -2.38    0.39
#> year_f2016         -2.06    0.34
#> year_f2017         -1.36    0.36
#> year_f2018         -1.52    0.35
#> year_f2019         -1.86    0.38
#> depth_scaled        0.06    0.10
#> depth_scaled_sq    -0.06    0.05
#> 
#> Matern range parameter: 0.21
#> Dispersion parameter: 1.59
#> Spatiotemporal SD (sigma_E): 2.88
#> Spatiotemporal AR1 correlation (rho): 0.84
#> ML criterion at convergence: 529.211

# Check AR1 parameter
tidy(m_amp, effects = "ran_pars", conf.int = TRUE) %>% filter(term == "rho")
#> filter: removed 3 rows (75%), one row remaining
#>   term  estimate std.error  conf.low conf.high
#> 1  rho 0.8431708 0.4655281 0.6503088 0.9339242
# Quite large

# Check residuals
d2_amp <- amp
d2_amp$residuals <- residuals(m_amp)

# Pretty good!
qqnorm(d2_amp$residuals); abline(a = 0, b = 1)


# Lastly, plot residuals on a map
ggplot(d2_amp, aes(lon, lat, colour = residuals)) +
  geom_point(size = 0.5) +
  facet_wrap(~year) +
  scale_color_gradient2()


# Plot the marginal effect of depth:
nd_amp <- data.frame(depth_scaled = seq(min(amp$depth_scaled), max(amp$depth_scaled), length.out = 100))
nd_amp$depth_scaled_sq <- nd_amp$depth_scaled*nd_amp$depth_scaled
nd_amp$year <- as.integer(max(amp$year))
nd_amp$year_f <- factor(nd_amp$year)

p_amp <- predict(m_amp, newdata = nd_amp, se_fit = TRUE, re_form = NA)

ggplot(p_amp, aes(depth_scaled, exp(est),
  ymin = exp(est - 1.96 * est_se), ymax = exp(est + 1.96 * est_se))) +
  geom_line() + geom_ribbon(alpha = 0.4)

Make prediction onto the new grid

# Predict using the Tweedie model
amp_pred_grid <- predict(m_amp, newdata = pred_grid_amp)

# Plot predictions!
amp_pred_grid %>% 
  ggplot(., aes(lon, lat, fill = est)) +
  geom_raster() +
  facet_wrap(~year, ncol = 2) +
  scale_fill_viridis(option = "magma", na.value = "transparent") + 
  geom_sf(data = world, inherit.aes = F, size = 0.2) +
  coord_sf(xlim = c(xmin, xmax), ylim = c(ymin, ymax)) +
  ggtitle("Prediction (random + fixed)")
#> Warning: Raster pixels are placed at uneven horizontal intervals and will be
#> shifted. Consider using geom_tile() instead.
#> Warning: Raster pixels are placed at uneven vertical intervals and will be
#> shifted. Consider using geom_tile() instead.


# Plot spatiotemporal random effects
amp_pred_grid %>% 
  ggplot(., aes(lon, lat, fill = epsilon_st)) +
  geom_raster() +
  facet_wrap(~year, ncol = 2) +
  scale_fill_gradient2(na.value = "transparent") +
  geom_sf(data = world, inherit.aes = F, size = 0.2) +
  coord_sf(xlim = c(xmin, xmax), ylim = c(ymin, ymax)) + 
  ggtitle("Spatiotemporal random effect")
#> Warning: Raster pixels are placed at uneven horizontal intervals and will be
#> shifted. Consider using geom_tile() instead.

#> Warning: Raster pixels are placed at uneven vertical intervals and will be
#> shifted. Consider using geom_tile() instead.

Now calculate the index by year and sub area

# From these models, predict annual biomass for each sub area
# sort(unique(amp$sub_area))

# Sub-area 25
amp_preds25 <- predict(m_amp, filter(pred_grid_amp, sub_area == 2),
                       return_tmb_object = TRUE) # Predict using the grid for subarea

amp_ind25 <- get_index(amp_preds25, bias_correct = T) # Get the index (sum of all cells in the pred grid)

# The index is now the sum of grid cells in g/area of a grid cell
# First multiply by the area of a grid cell to get raw g
# Then divide that by the total area
ncells25 <- pred_grid_amp %>% filter(year == 2015 & sub_area == 2) %>% nrow()

# This is the same as simply dividing by ncells2
amp_ind25 <- amp_ind25 %>% mutate(est = est / ncells25,
                                  lwr = lwr / ncells25,
                                  upr = upr / ncells25,
                                  sub_area = 2)

# Sub-area 27
amp_preds27 <- predict(m_amp, filter(pred_grid_amp, sub_area == 4),
                       return_tmb_object = TRUE) # Predict using the grid for subarea

amp_ind27 <- get_index(amp_preds27, bias_correct = T) # Get the index (sum of all cells in the pred grid)

# The index is now the sum of grid cells in g/area of a grid cell
# First multiply by the area of a grid cell to get raw g
# Then divide that by the total area
ncells27 <- pred_grid_amp %>% filter(year == 2015 & sub_area == 4) %>% nrow()

# This is the same as simply dividing by ncells2
amp_ind27 <- amp_ind27 %>% mutate(est = est / ncells27,
                                  lwr = lwr / ncells27,
                                  upr = upr / ncells27,
                                  sub_area = 4)
 
# Sub-area 28
amp_preds28 <- predict(m_amp, filter(pred_grid_amp, sub_area == 5),
                       return_tmb_object = TRUE) # Predict using the grid for subarea

amp_ind28 <- get_index(amp_preds28, bias_correct = T) # Get the index (sum of all cells in the pred grid)

# The index is now the sum of grid cells in g/area of a grid cell
# First multiply by the area of a grid cell to get raw g
# Then divide that by the total area
ncells28 <- pred_grid_amp %>% filter(year == 2015 & sub_area == 5) %>% nrow()

# This is the same as simply dividing by ncells2
amp_ind28 <- amp_ind28 %>% mutate(est = est / ncells28,
                                  lwr = lwr / ncells28,
                                  upr = upr / ncells28,
                                  sub_area = 5)

# Merge prediction-data
amp_preds <- bind_rows(amp_ind25, amp_ind27, amp_ind28)

# Compare to data quickly
p <- amp_preds %>%
  dplyr::select(year, est, upr, lwr, sub_area) %>%
  mutate(source = "pred") %>% 
  arrange(year, sub_area)

d <- amp %>%
  filter(!sub_area == 24) %>% 
  group_by(year, sub_area) %>% 
  summarise(est = mean(biomass)) %>%
  mutate(source = "data") %>% 
  arrange(year, sub_area)
  
ggplot(bind_rows(p, d), aes(year, est, color = source)) +
  geom_point(size = 4) + 
  geom_line(size = 1) + 
  facet_wrap(~ sub_area, scales = "free", ncol = 2) +
  geom_ribbon(aes(ymin = lwr, ymax = upr), alpha = 0.4) + 
  theme_classic(base_size = 15) + 
  ggtitle("Amphipoda")
#> Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning -
#> Inf

#> Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning -
#> Inf

#> Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning -
#> Inf

Mytiloida

Read data

# myt <- readr::read_csv("https://raw.githubusercontent.com/maxlindmark/bentfish/main/data/for_analysis/mytiloida.csv") %>%
#   dplyr::select(-X1)
myt <- read.csv("data/for_analysis/mytiloida.csv") %>% dplyr::select(-X)

myt <- myt %>%
  filter(lat < ymax & lat > ymin & lon > xmin & lon < xmax) %>% 
  filter(year > 2014) %>% 
  mutate(year_f = as.factor(year_f),
         depth_scaled_sq = depth_scaled*depth_scaled)

unique(is.na(myt))
#>       year month   day   lon   lat abundance biomass species_group depth
#> [1,] FALSE FALSE FALSE FALSE FALSE     FALSE   FALSE         FALSE FALSE
#>      prov_nr provID sub_area  taxa sample_id depth_scaled year_f biomass_g_m2
#> [1,]   FALSE  FALSE    FALSE FALSE     FALSE        FALSE  FALSE        FALSE
#>      biomass_g_km2 biomass_kg_km2 depth_scaled_sq
#> [1,]         FALSE          FALSE           FALSE

nrow(myt)
#> [1] 841

hist(myt$biomass)


# Plot depth
p1 <- ggplot(myt, aes(depth)) +
  geom_histogram() +
  theme_classic(base_size = 18) +
  ggtitle("all samples")

p2 <- filter(myt, biomass > 0) %>%
  ggplot(., aes(depth)) +
  geom_histogram() +
  theme_classic(base_size = 18) +
  ggtitle("positive biomass")

p1/p2


ggsave("figures/mytiloida_depth.png", width = 9, height = 9, dpi = 600)

Read and crop pred grid to match mytiloida

## Read the prediction grid
# pred_grid <- readr::read_csv("https://raw.githubusercontent.com/maxlindmark/bentfish/main/data/for_analysis/pred_grid.csv")
pred_grid <- read.csv("data/for_analysis/pred_grid.csv")

pred_grid <- pred_grid %>%
  mutate(depth_scaled = (depth - mean(myt$depth))/sd(myt$depth)) %>% 
  mutate(depth_scaled_sq = depth_scaled*depth_scaled) %>% 
  mutate(year_f = factor(year),
         year = as.integer(year)) %>% 
  filter(year > 2014) 

# Subset the prediction grid to match mytiloida data
pred_grid_myt <- pred_grid

tf_myt <- exclude.too.far(pred_grid_myt$lon, pred_grid_myt$lat, myt$lon, myt$lat, 0.01) # 0.03 seems reasonable

# Filter the grid points that are not too far from the data
pred_grid_myt$too_far <- tf_myt

# Plot again
pred_grid_myt %>%
  filter(too_far == FALSE) %>%
  ggplot(., aes(lon, lat, fill = factor(sub_area))) +
  geom_raster() +
  geom_point(data = myt, aes(lon, lat), color = "black", size = 0.5) +
  NULL


pred_grid_myt <- pred_grid_myt %>% filter(too_far == FALSE)

Fit models using sdmTMB assuming a Tweedie distribution and biomass as the response.

Make spde mesh

# Non-island version
myt_spde <- make_mesh(data = myt, xy_cols = c("lon", "lat"), n_knots = 80, type = "kmeans", seed = 42)

Fit the model

m_myt <- sdmTMB(formula = biomass ~ year_f - 1 + depth_scaled + depth_scaled_sq, data = myt,
                time = "year", spde = myt_spde, family = tweedie(link = "log"),
                ar1_fields = TRUE, include_spatial = FALSE, spatial_trend = FALSE, spatial_only = FALSE)

# Check model
print(m_myt)
#> Spatiotemporal model fit by ML ['sdmTMB']
#> Formula: biomass ~ year_f - 1 + depth_scaled + depth_scaled_sq
#> Time column: "year"
#> SPDE: myt_spde
#> Data: myt
#> Family: tweedie(link = 'log')
#>                 coef.est coef.se
#> year_f2015         -0.11    0.71
#> year_f2016          0.10    0.66
#> year_f2017          0.78    0.69
#> year_f2018          0.48    0.67
#> year_f2019          1.24    0.71
#> depth_scaled       -0.40    0.16
#> depth_scaled_sq     0.06    0.11
#> 
#> Matern range parameter: 0.15
#> Dispersion parameter: 9.14
#> Spatiotemporal SD (sigma_E): 5.59
#> Spatiotemporal AR1 correlation (rho): 0.97
#> ML criterion at convergence: 1554.628

# Check AR1 parameter
tidy(m_myt, effects = "ran_pars", conf.int = TRUE) %>% filter(term == "rho")
#> filter: removed 3 rows (75%), one row remaining
#>   term  estimate std.error  conf.low conf.high
#> 1  rho 0.9693264 0.6603379 0.8924637 0.9914975
# Quite large

# Check residuals
d2_myt <- myt
d2_myt$residuals <- residuals(m_myt)

# Pretty good!
qqnorm(d2_myt$residuals); abline(a = 0, b = 1)


# Lastly, plot residuals on a map
ggplot(d2_myt, aes(lon, lat, colour = residuals)) +
  geom_point(size = 0.5) +
  facet_wrap(~year) +
  scale_color_gradient2()


# Plot the marginal effect of depth:
nd_myt <- data.frame(depth_scaled = seq(min(myt$depth_scaled), max(myt$depth_scaled), length.out = 100))
nd_myt$depth_scaled_sq <- nd_myt$depth_scaled*nd_myt$depth_scaled
nd_myt$year <- as.integer(max(myt$year))
nd_myt$year_f <- factor(nd_myt$year)

p_myt <- predict(m_myt, newdata = nd_myt, se_fit = TRUE, re_form = NA)

ggplot(p_myt, aes(depth_scaled, exp(est),
  ymin = exp(est - 1.96 * est_se), ymax = exp(est + 1.96 * est_se))) +
  geom_line() + geom_ribbon(alpha = 0.4)

Make prediction onto the new grid

# Predict using the Tweedie model
myt_pred_grid <- predict(m_myt, newdata = pred_grid_myt)

# Plot predictions!
myt_pred_grid %>% 
  ggplot(., aes(lon, lat, fill = est)) +
  geom_raster() +
  facet_wrap(~year, ncol = 2) +
  scale_fill_viridis(option = "magma", na.value = "transparent") + 
  geom_sf(data = world, inherit.aes = F, size = 0.2) +
  coord_sf(xlim = c(xmin, xmax), ylim = c(ymin, ymax)) +
  ggtitle("Prediction (random + fixed)")
#> Warning: Raster pixels are placed at uneven horizontal intervals and will be
#> shifted. Consider using geom_tile() instead.
#> Warning: Raster pixels are placed at uneven vertical intervals and will be
#> shifted. Consider using geom_tile() instead.


# Plot spatiotemporal random effects
myt_pred_grid %>% 
  ggplot(., aes(lon, lat, fill = epsilon_st)) +
  geom_raster() +
  facet_wrap(~year, ncol = 2) +
  scale_fill_gradient2(na.value = "transparent") +
  geom_sf(data = world, inherit.aes = F, size = 0.2) +
  coord_sf(xlim = c(xmin, xmax), ylim = c(ymin, ymax)) + 
  ggtitle("Spatiotemporal random effect")
#> Warning: Raster pixels are placed at uneven horizontal intervals and will be
#> shifted. Consider using geom_tile() instead.

#> Warning: Raster pixels are placed at uneven vertical intervals and will be
#> shifted. Consider using geom_tile() instead.

Now calculate the index by year and sub area

# From these models, predict annual biomass for each sub area
# sort(unique(myt$sub_area))

# Sub-area 25
myt_preds25 <- predict(m_myt, filter(pred_grid_myt, sub_area == 2),
                   return_tmb_object = TRUE) # Predict using the grid for subarea

myt_ind25 <- get_index(myt_preds25, bias_correct = T) # Get the index (sum of all cells in the pred grid)

# The index is now the sum of grid cells in g/area of a grid cell
# First multiply by the area of a grid cell to get raw g
# Then divide that by the total area
ncells25 <- pred_grid_myt %>% filter(year == 2015 & sub_area == 2) %>% nrow()

# This is the same as simply dividing by ncells2
myt_ind25 <- myt_ind25 %>% mutate(est = est / ncells25,
                                  lwr = lwr / ncells25,
                                  upr = upr / ncells25,
                                  sub_area = 2)

# Sub-area 27
myt_preds27 <- predict(m_myt, filter(pred_grid_myt, sub_area == 4),
                       return_tmb_object = TRUE) # Predict using the grid for subarea

myt_ind27 <- get_index(myt_preds27, bias_correct = T) # Get the index (sum of all cells in the pred grid)

# The index is now the sum of grid cells in g/area of a grid cell
# First multiply by the area of a grid cell to get raw g
# Then divide that by the total area
ncells27 <- pred_grid_myt %>% filter(year == 2015 & sub_area == 4) %>% nrow()

# This is the same as simply dividing by ncells2
myt_ind27 <- myt_ind27 %>% mutate(est = est / ncells27,
                                  lwr = lwr / ncells27,
                                  upr = upr / ncells27,
                                  sub_area = 4)
 
# Sub-area 28
myt_preds28 <- predict(m_myt, filter(pred_grid_myt, sub_area == 5),
                       return_tmb_object = TRUE) # Predict using the grid for subarea

myt_ind28 <- get_index(myt_preds28, bias_correct = T) # Get the index (sum of all cells in the pred grid)

# The index is now the sum of grid cells in g/area of a grid cell
# First multiply by the area of a grid cell to get raw g
# Then divide that by the total area
ncells28 <- pred_grid_myt %>% filter(year == 2015 & sub_area == 5) %>% nrow()

# This is the same as simply dividing by ncells2
myt_ind28 <- myt_ind28 %>% mutate(est = est / ncells28,
                                  lwr = lwr / ncells28,
                                  upr = upr / ncells28,
                                  sub_area = 5)

# Merge prediction-data
myt_preds <- bind_rows(myt_ind25, myt_ind27, myt_ind28)

# Compare to data quickly
p <- myt_preds %>%
  dplyr::select(year, est, upr, lwr, sub_area) %>%
  mutate(source = "pred") %>% 
  arrange(year, sub_area)

d <- myt %>%
  filter(!sub_area == 24) %>% 
  group_by(year, sub_area) %>% 
  summarise(est = mean(biomass)) %>% 
  mutate(source = "data") %>% 
  arrange(year, sub_area)
  
ggplot(bind_rows(p, d), aes(year, est, color = source)) +
  geom_point(size = 4) + 
  geom_line(size = 1) + 
  facet_wrap(~ sub_area, scales = "free", ncol = 2) +
  geom_ribbon(aes(ymin = lwr, ymax = upr), alpha = 0.4) + 
  theme_classic(base_size = 15) + 
  ggtitle("Mytiloida")
#> Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning -
#> Inf

#> Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning -
#> Inf

#> Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning -
#> Inf

Limecola baltica

Read data

# lim <- readr::read_csv("https://raw.githubusercontent.com/maxlindmark/bentfish/main/data/for_analysis/limecola.csv") %>%
#   dplyr::select(-X1)
lim <- read.csv("data/for_analysis/limecola.csv") %>% dplyr::select(-X)

lim <- lim %>%
  filter(lat < ymax & lat > ymin & lon > xmin & lon < xmax) %>% 
  filter(year > 2014) %>% 
  mutate(year_f = as.factor(year_f),
         depth_scaled_sq = depth_scaled*depth_scaled)

unique(is.na(lim))
#>       year month   day   lon   lat abundance biomass species_group depth
#> [1,] FALSE FALSE FALSE FALSE FALSE     FALSE   FALSE         FALSE FALSE
#>      prov_nr provID sub_area  taxa sample_id depth_scaled year_f biomass_g_m2
#> [1,]   FALSE  FALSE    FALSE FALSE     FALSE        FALSE  FALSE        FALSE
#>      biomass_g_km2 biomass_kg_km2 depth_scaled_sq
#> [1,]         FALSE          FALSE           FALSE

nrow(lim)
#> [1] 841

hist(lim$biomass)


# Plot depth
p1 <- ggplot(lim, aes(depth)) +
  geom_histogram() +
  theme_classic(base_size = 18) +
  ggtitle("all samples")

p2 <- filter(lim, biomass > 0) %>%
  ggplot(., aes(depth)) +
  geom_histogram() +
  theme_classic(base_size = 18) +
  ggtitle("positive biomass")

p1/p2


ggsave("figures/limecola_depth.png", width = 9, height = 9, dpi = 600)

Read and crop pred grid to match limecola

## Read the prediction grid
# pred_grid <- readr::read_csv("https://raw.githubusercontent.com/maxlindmark/bentfish/main/data/for_analysis/pred_grid.csv")
pred_grid <- read.csv("data/for_analysis/pred_grid.csv")

pred_grid <- pred_grid %>%
  mutate(depth_scaled = (depth - mean(lim$depth))/sd(lim$depth)) %>% 
  mutate(depth_scaled_sq = depth_scaled*depth_scaled) %>% 
  mutate(year_f = factor(year),
         year = as.integer(year)) %>% 
  filter(year > 2014) 

# Subset the prediction grid to match limecole data
pred_grid_lim <- pred_grid

tf_lim <- exclude.too.far(pred_grid_lim$lon, pred_grid_lim$lat, lim$lon, lim$lat, 0.01) # 0.03 seems reasonable

# Filter the grid points that are not too far from the data
pred_grid_lim$too_far <- tf_lim

# Plot again
pred_grid_lim %>%
  filter(too_far == FALSE) %>%
  ggplot(., aes(lon, lat, fill = factor(sub_area))) +
  geom_raster() +
  geom_point(data = lim, aes(lon, lat), color = "black", size = 0.5) +
  NULL


pred_grid_lim <- pred_grid_lim %>% filter(too_far == FALSE)

Fit models using sdmTMB assuming a Tweedie distribution and biomass as the response.

Make spde mesh

# Non-island version
lim_spde <- make_mesh(data = lim, xy_cols = c("lon", "lat"), n_knots = 80, type = "kmeans", seed = 42)

Fit the model

m_lim <- sdmTMB(formula = biomass ~ year_f - 1 + depth_scaled + depth_scaled_sq, data = lim,
                time = "year", spde = lim_spde, family = tweedie(link = "log"),
                ar1_fields = TRUE, include_spatial = FALSE, spatial_trend = FALSE, spatial_only = FALSE)

# Check model
print(m_lim)
#> Spatiotemporal model fit by ML ['sdmTMB']
#> Formula: biomass ~ year_f - 1 + depth_scaled + depth_scaled_sq
#> Time column: "year"
#> SPDE: lim_spde
#> Data: lim
#> Family: tweedie(link = 'log')
#>                 coef.est coef.se
#> year_f2015          3.76    0.19
#> year_f2016          3.65    0.18
#> year_f2017          3.58    0.19
#> year_f2018          3.61    0.18
#> year_f2019          3.87    0.19
#> depth_scaled       -0.01    0.04
#> depth_scaled_sq    -0.03    0.02
#> 
#> Matern range parameter: 0.30
#> Dispersion parameter: 2.65
#> Spatiotemporal SD (sigma_E): 0.94
#> Spatiotemporal AR1 correlation (rho): 0.97
#> ML criterion at convergence: 3829.140

# Check AR1 parameter
tidy(m_lim, effects = "ran_pars", conf.int = TRUE) %>% filter(term == "rho")
#> filter: removed 3 rows (75%), one row remaining
#>   term  estimate std.error  conf.low conf.high
#> 1  rho 0.9705629 0.5961392 0.9082966 0.9907554
# Quite large

# Check residuals
d2_lim <- lim
d2_lim$residuals <- residuals(m_lim)

# OK
qqnorm(d2_lim$residuals); abline(a = 0, b = 1)


# Lastly, plot residuals on a map
ggplot(d2_lim, aes(lon, lat, colour = residuals)) +
  geom_point(size = 0.5) +
  facet_wrap(~year) +
  scale_color_gradient2()


# Plot the marginal effect of depth:
nd_lim <- data.frame(depth_scaled = seq(min(lim$depth_scaled), max(lim$depth_scaled), length.out = 100))
nd_lim$depth_scaled_sq <- nd_lim$depth_scaled*nd_lim$depth_scaled
nd_lim$year <- as.integer(max(lim$year))
nd_lim$year_f <- factor(nd_lim$year)

p_lim <- predict(m_lim, newdata = nd_lim, se_fit = TRUE, re_form = NA)

ggplot(p_lim, aes(depth_scaled, exp(est),
  ymin = exp(est - 1.96 * est_se), ymax = exp(est + 1.96 * est_se))) +
  geom_line() + geom_ribbon(alpha = 0.4)

Make prediction onto the new grid

# Predict using the Tweedie model
lim_pred_grid <- predict(m_lim, newdata = pred_grid_lim)

# Plot predictions!
lim_pred_grid %>% 
  ggplot(., aes(lon, lat, fill = est)) +
  geom_raster() +
  facet_wrap(~year, ncol = 2) +
  scale_fill_viridis(option = "magma", na.value = "transparent") + 
  geom_sf(data = world, inherit.aes = F, size = 0.2) +
  coord_sf(xlim = c(xmin, xmax), ylim = c(ymin, ymax)) +
  ggtitle("Prediction (random + fixed)")
#> Warning: Raster pixels are placed at uneven horizontal intervals and will be
#> shifted. Consider using geom_tile() instead.
#> Warning: Raster pixels are placed at uneven vertical intervals and will be
#> shifted. Consider using geom_tile() instead.


# Plot spatiotemporal random effects
lim_pred_grid %>% 
  ggplot(., aes(lon, lat, fill = epsilon_st)) +
  geom_raster() +
  facet_wrap(~year, ncol = 2) +
  scale_fill_gradient2(na.value = "transparent") +
  geom_sf(data = world, inherit.aes = F, size = 0.2) +
  coord_sf(xlim = c(xmin, xmax), ylim = c(ymin, ymax)) + 
  ggtitle("Spatiotemporal random effect")
#> Warning: Raster pixels are placed at uneven horizontal intervals and will be
#> shifted. Consider using geom_tile() instead.

#> Warning: Raster pixels are placed at uneven vertical intervals and will be
#> shifted. Consider using geom_tile() instead.

Now calculate the index by year and sub area

# From these models, predict annual biomass for each sub area
# sort(unique(lim$sub_area))

# Sub-area 25
lim_preds25 <- predict(m_lim, filter(pred_grid_lim, sub_area == 2),
                       return_tmb_object = TRUE) # Predict using the grid for subarea

lim_ind25 <- get_index(lim_preds25, bias_correct = T) # Get the index (sum of all cells in the pred grid)

# The index is now the sum of grid cells in g/area of a grid cell
# First multiply by the area of a grid cell to get raw g
# Then divide that by the total area
ncells25 <- pred_grid_lim %>% filter(year == 2015 & sub_area == 2) %>% nrow()

# This is the same as simply dividing by ncells2
lim_ind25 <- lim_ind25 %>% mutate(est = est / ncells25,
                                  lwr = lwr / ncells25,
                                  upr = upr / ncells25,
                                  sub_area = 2)

# Sub-area 27
lim_preds27 <- predict(m_lim, filter(pred_grid_lim, sub_area == 4),
                       return_tmb_object = TRUE) # Predict using the grid for subarea

lim_ind27 <- get_index(lim_preds27, bias_correct = T) # Get the index (sum of all cells in the pred grid)

# The index is now the sum of grid cells in g/area of a grid cell
# First multiply by the area of a grid cell to get raw g
# Then divide that by the total area
ncells27 <- pred_grid_lim %>% filter(year == 2015 & sub_area == 4) %>% nrow()

# This is the same as simply dividing by ncells2
lim_ind27 <- lim_ind27 %>% mutate(est = est / ncells27,
                                  lwr = lwr / ncells27,
                                  upr = upr / ncells27,
                                  sub_area = 4)
 
# Sub-area 28
lim_preds28 <- predict(m_lim, filter(pred_grid_lim, sub_area == 5),
                       return_tmb_object = TRUE) # Predict using the grid for subarea

lim_ind28 <- get_index(lim_preds28, bias_correct = T) # Get the index (sum of all cells in the pred grid)

# The index is now the sum of grid cells in g/area of a grid cell
# First multiply by the area of a grid cell to get raw g
# Then divide that by the total area
ncells28 <- pred_grid_lim %>% filter(year == 2015 & sub_area == 5) %>% nrow()

# This is the same as simply dividing by ncells2
lim_ind28 <- lim_ind28 %>% mutate(est = est / ncells28,
                                  lwr = lwr / ncells28,
                                  upr = upr / ncells28,
                                  sub_area = 5)

# Merge prediction-data
lim_preds <- bind_rows(lim_ind25, lim_ind27, lim_ind28)

# Compare to data quickly
p <- lim_preds %>%
  dplyr::select(year, est, upr, lwr, sub_area) %>%
  mutate(source = "pred") %>% 
  arrange(year, sub_area)

d <- lim %>%
  filter(!sub_area == 24) %>% 
  group_by(year, sub_area) %>% 
  summarise(est = mean(biomass)) %>% 
  mutate(source = "data") %>% 
  arrange(year, sub_area)
  
ggplot(bind_rows(p, d), aes(year, est, color = source)) +
  geom_point(size = 4) + 
  geom_line(size = 1) + 
  facet_wrap(~ sub_area, scales = "free", ncol = 2) +
  geom_ribbon(aes(ymin = lwr, ymax = upr), alpha = 0.4) + 
  theme_classic(base_size = 15) + 
  ggtitle("Limecola")
#> Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning -
#> Inf

#> Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning -
#> Inf

#> Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning -
#> Inf

Polychaeta

Read data

# pol <- readr::read_csv("https://raw.githubusercontent.com/maxlindmark/bentfish/main/data/for_analysis/polychaeta.csv") %>%
#   dplyr::select(-X1)
pol <- read.csv("data/for_analysis/polychaeta.csv") %>% dplyr::select(-X)

pol <- pol %>%
  filter(lat < ymax & lat > ymin & lon > xmin & lon < xmax) %>% 
  filter(year > 2014) %>% 
  mutate(year_f = as.factor(year_f),
         depth_scaled_sq = depth_scaled*depth_scaled)

unique(is.na(pol))
#>       year month   day   lon   lat species_group depth prov_nr provID sub_area
#> [1,] FALSE FALSE FALSE FALSE FALSE         FALSE FALSE   FALSE  FALSE    FALSE
#>       taxa sample_id depth_scaled year_f biomass abundance biomass_g_m2
#> [1,] FALSE     FALSE        FALSE  FALSE   FALSE     FALSE        FALSE
#>      biomass_g_km2 biomass_kg_km2 depth_scaled_sq
#> [1,]         FALSE          FALSE           FALSE

nrow(pol)
#> [1] 841

hist(pol$biomass)


# Plot depth
p1 <- ggplot(pol, aes(depth)) +
  geom_histogram() +
  theme_classic(base_size = 18) +
  ggtitle("all samples")

p2 <- filter(pol, biomass > 0) %>%
  ggplot(., aes(depth)) +
  geom_histogram() +
  theme_classic(base_size = 18) +
  ggtitle("positive biomass")

p1/p2


ggsave("figures/polychaeta_depth.png", width = 9, height = 9, dpi = 600)

Read and crop pred grid to match polychaeta

## Read the prediction grid
# pred_grid <- readr::read_csv("https://raw.githubusercontent.com/maxlindmark/bentfish/main/data/for_analysis/pred_grid.csv")
pred_grid <- read.csv("data/for_analysis/pred_grid.csv")

pred_grid <- pred_grid %>%
  mutate(depth_scaled = (depth - mean(pol$depth))/sd(pol$depth)) %>% 
  mutate(depth_scaled_sq = depth_scaled*depth_scaled) %>% 
  mutate(year_f = factor(year),
         year = as.integer(year)) %>% 
  filter(year > 2014) 

# Subset the prediction grid to match polychaeta data
pred_grid_pol <- pred_grid

tf_pol <- exclude.too.far(pred_grid_pol$lon, pred_grid_pol$lat, pol$lon, pol$lat, 0.01) # 0.03 seems reasonable

# Filter the grid points that are not too far from the data
pred_grid_pol$too_far <- tf_pol

# Plot again
pred_grid_pol %>%
  filter(too_far == FALSE) %>%
  ggplot(., aes(lon, lat, fill = factor(sub_area))) +
  geom_raster() +
  geom_point(data = pol, aes(lon, lat), color = "black", size = 0.5) +
  NULL


pred_grid_pol <- pred_grid_pol %>% filter(too_far == FALSE)

Fit models using sdmTMB assuming a Tweedie distribution and biomass as the response.

Make spde mesh

# Non-island version
pol_spde <- make_mesh(data = pol, xy_cols = c("lon", "lat"), n_knots = 80, type = "kmeans", seed = 42)

Fit the model

m_pol <- sdmTMB(formula = biomass ~ year_f - 1 + depth_scaled + depth_scaled_sq, data = pol,
                time = "year", spde = pol_spde, family = tweedie(link = "log"),
                ar1_fields = TRUE, include_spatial = FALSE, spatial_trend = FALSE, spatial_only = FALSE)

# Check model
print(m_pol)
#> Spatiotemporal model fit by ML ['sdmTMB']
#> Formula: biomass ~ year_f - 1 + depth_scaled + depth_scaled_sq
#> Time column: "year"
#> SPDE: pol_spde
#> Data: pol
#> Family: tweedie(link = 'log')
#>                 coef.est coef.se
#> year_f2015          0.34    0.23
#> year_f2016          0.82    0.21
#> year_f2017          0.74    0.22
#> year_f2018          1.15    0.21
#> year_f2019          1.12    0.23
#> depth_scaled       -0.14    0.06
#> depth_scaled_sq     0.06    0.04
#> 
#> Matern range parameter: 0.10
#> Dispersion parameter: 1.81
#> Spatiotemporal SD (sigma_E): 2.51
#> Spatiotemporal AR1 correlation (rho): 0.99
#> ML criterion at convergence: 1934.120

# Check AR1 parameter
tidy(m_pol, effects = "ran_pars", conf.int = TRUE) %>% filter(term == "rho")
#> filter: removed 3 rows (75%), one row remaining
#>   term  estimate std.error  conf.low conf.high
#> 1  rho 0.9906931  1.007708 0.9348073 0.9987035
# Quite large

# Check residuals
d2_pol <- pol
d2_pol$residuals <- residuals(m_pol)

# OK
qqnorm(d2_pol$residuals); abline(a = 0, b = 1)


# Lastly, plot residuals on a map
ggplot(d2_pol, aes(lon, lat, colour = residuals)) +
  geom_point(size = 0.5) +
  facet_wrap(~year) +
  scale_color_gradient2()


# Plot the marginal effect of depth:
nd_pol <- data.frame(depth_scaled = seq(min(pol$depth_scaled), max(pol$depth_scaled), length.out = 100))
nd_pol$depth_scaled_sq <- nd_pol$depth_scaled*nd_pol$depth_scaled
nd_pol$year <- as.integer(max(pol$year))
nd_pol$year_f <- factor(nd_pol$year)

p_pol <- predict(m_pol, newdata = nd_pol, se_fit = TRUE, re_form = NA)

ggplot(p_pol, aes(depth_scaled, exp(est),
  ymin = exp(est - 1.96 * est_se), ymax = exp(est + 1.96 * est_se))) +
  geom_line() + geom_ribbon(alpha = 0.4)

Make prediction onto the new grid

# Predict using the Tweedie model
pol_pred_grid <- predict(m_pol, newdata = pred_grid_pol)

# Plot predictions!
pol_pred_grid %>% 
  ggplot(., aes(lon, lat, fill = est)) +
  geom_raster() +
  facet_wrap(~year, ncol = 2) +
  scale_fill_viridis(option = "magma", na.value = "transparent") + 
  geom_sf(data = world, inherit.aes = F, size = 0.2) +
  coord_sf(xlim = c(xmin, xmax), ylim = c(ymin, ymax)) +
  ggtitle("Prediction (random + fixed)")
#> Warning: Raster pixels are placed at uneven horizontal intervals and will be
#> shifted. Consider using geom_tile() instead.
#> Warning: Raster pixels are placed at uneven vertical intervals and will be
#> shifted. Consider using geom_tile() instead.


# Plot spatiotemporal random effects
pol_pred_grid %>% 
  ggplot(., aes(lon, lat, fill = epsilon_st)) +
  geom_raster() +
  facet_wrap(~year, ncol = 2) +
  scale_fill_gradient2(na.value = "transparent") +
  geom_sf(data = world, inherit.aes = F, size = 0.2) +
  coord_sf(xlim = c(xmin, xmax), ylim = c(ymin, ymax)) + 
  ggtitle("Spatiotemporal random effect")
#> Warning: Raster pixels are placed at uneven horizontal intervals and will be
#> shifted. Consider using geom_tile() instead.

#> Warning: Raster pixels are placed at uneven vertical intervals and will be
#> shifted. Consider using geom_tile() instead.

Now calculate the index by year and sub area

# From these models, predict annual biomass for each sub area
# sort(unique(pol$sub_area))

# Sub-area 25
pol_preds25 <- predict(m_pol, filter(pred_grid_pol, sub_area == 2),
                       return_tmb_object = TRUE) # Predict using the grid for subarea

pol_ind25 <- get_index(pol_preds25, bias_correct = T) # Get the index (sum of all cells in the pred grid)

# The index is now the sum of grid cells in g/area of a grid cell
# First multiply by the area of a grid cell to get raw g
# Then divide that by the total area
ncells25 <- pred_grid_pol %>% filter(year == 2015 & sub_area == 2) %>% nrow()

# This is the same as simply dividing by ncells2
pol_ind25 <- pol_ind25 %>% mutate(est = est / ncells25,
                                  lwr = lwr / ncells25,
                                  upr = upr / ncells25,
                                  sub_area = 2)

# Sub-area 27
pol_preds27 <- predict(m_pol, filter(pred_grid_pol, sub_area == 4),
                       return_tmb_object = TRUE) # Predict using the grid for subarea

pol_ind27 <- get_index(pol_preds27, bias_correct = T) # Get the index (sum of all cells in the pred grid)

# The index is now the sum of grid cells in g/area of a grid cell
# First multiply by the area of a grid cell to get raw g
# Then divide that by the total area
ncells27 <- pred_grid_pol %>% filter(year == 2015 & sub_area == 4) %>% nrow()

# This is the same as simply dividing by ncells2
pol_ind27 <- pol_ind27 %>% mutate(est = est / ncells27,
                                  lwr = lwr / ncells27,
                                  upr = upr / ncells27,
                                  sub_area = 4)
 
# Sub-area 28
pol_preds28 <- predict(m_pol, filter(pred_grid_pol, sub_area == 5),
                       return_tmb_object = TRUE) # Predict using the grid for subarea

pol_ind28 <- get_index(pol_preds28, bias_correct = T) # Get the index (sum of all cells in the pred grid)

# The index is now the sum of grid cells in g/area of a grid cell
# First multiply by the area of a grid cell to get raw g
# Then divide that by the total area
ncells28 <- pred_grid_pol %>% filter(year == 2015 & sub_area == 5) %>% nrow()

# This is the same as simply dividing by ncells2
pol_ind28 <- pol_ind28 %>% mutate(est = est / ncells28,
                                  lwr = lwr / ncells28,
                                  upr = upr / ncells28,
                                  sub_area = 5)

# Merge prediction-data
pol_preds <- bind_rows(pol_ind25, pol_ind27, pol_ind28)

# Compare to data quickly
p <- pol_preds %>%
  dplyr::select(year, est, upr, lwr, sub_area) %>%
  mutate(source = "pred") %>% 
  arrange(year, sub_area)

d <- pol %>%
  filter(!sub_area == 24) %>% 
  group_by(year, sub_area) %>% 
  summarise(est = mean(biomass)) %>% 
  mutate(source = "data") %>% 
  arrange(year, sub_area)
  
ggplot(bind_rows(p, d), aes(year, est, color = source)) +
  geom_point(size = 4) + 
  geom_line(size = 1) + 
  facet_wrap(~ sub_area, scales = "free", ncol = 2) +
  geom_ribbon(aes(ymin = lwr, ymax = upr), alpha = 0.4) + 
  theme_classic(base_size = 15) + 
  ggtitle("Polychaeta")
#> Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning -
#> Inf

#> Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning -
#> Inf

#> Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning -
#> Inf

Cumacea

Read data

# cuma <- readr::read_csv("https://raw.githubusercontent.com/maxlindmark/bentfish/main/data/for_analysis/cumacea.csv") %>%
#   dplyr::select(-X1)
cuma <- read.csv("data/for_analysis/cumacea.csv") %>% dplyr::select(-X)

cuma <- cuma %>%
  filter(lat < ymax & lat > ymin & lon > xmin & lon < xmax) %>% 
  filter(year > 2014) %>% 
  mutate(year_f = as.factor(year_f),
         depth_scaled_sq = depth_scaled*depth_scaled)

unique(is.na(cuma))
#>       year month   day   lon   lat species_group depth prov_nr provID sub_area
#> [1,] FALSE FALSE FALSE FALSE FALSE         FALSE FALSE   FALSE  FALSE    FALSE
#>       taxa sample_id depth_scaled year_f biomass abundance biomass_g_m2
#> [1,] FALSE     FALSE        FALSE  FALSE   FALSE     FALSE        FALSE
#>      biomass_g_km2 biomass_kg_km2 depth_scaled_sq
#> [1,]         FALSE          FALSE           FALSE

nrow(cuma)
#> [1] 841

hist(cuma$biomass)


# Plot depth
p1 <- ggplot(cuma, aes(depth)) +
  geom_histogram() +
  theme_classic(base_size = 18) +
  ggtitle("all samples")

p2 <- filter(cuma, biomass > 0) %>%
  ggplot(., aes(depth)) +
  geom_histogram() +
  theme_classic(base_size = 18) +
  ggtitle("positive biomass")

p1/p2


ggsave("figures/cumacea_depth.png", width = 9, height = 9, dpi = 600)

Read and crop pred grid to match cumacea

## Read the prediction grid
# pred_grid <- readr::read_csv("https://raw.githubusercontent.com/maxlindmark/bentfish/main/data/for_analysis/pred_grid.csv")
pred_grid <- read.csv("data/for_analysis/pred_grid.csv")

pred_grid <- pred_grid %>%
  mutate(depth_scaled = (depth - mean(cuma$depth))/sd(cuma$depth)) %>% 
  mutate(depth_scaled_sq = depth_scaled*depth_scaled) %>% 
  mutate(year_f = factor(year),
         year = as.integer(year)) %>% 
  filter(year > 2014) 

# Subset the prediction grid to match polychaeta data
pred_grid_cuma <- pred_grid

tf_cuma <- exclude.too.far(pred_grid_cuma$lon, pred_grid_cuma$lat, cuma$lon, cuma$lat, 0.01) # 0.03 seems reasonable

# Filter the grid points that are not too far from the data
pred_grid_cuma$too_far <- tf_cuma

# Plot again
pred_grid_cuma %>%
  filter(too_far == FALSE) %>%
  ggplot(., aes(lon, lat, fill = factor(sub_area))) +
  geom_raster() +
  geom_point(data = cuma, aes(lon, lat), color = "black", size = 0.5) +
  NULL


pred_grid_cuma <- pred_grid_cuma %>% filter(too_far == FALSE)

Fit models using sdmTMB assuming a Tweedie distribution and biomass as the response.

Make spde mesh

# Non-island version
cuma_spde <- make_mesh(data = cuma, xy_cols = c("lon", "lat"), n_knots = 80, type = "kmeans", seed = 42)

Fit the model

m_cuma <- sdmTMB(formula = biomass ~ year_f - 1 + depth_scaled + depth_scaled_sq, data = cuma,
                 time = "year", spde = cuma_spde, family = tweedie(link = "log"),
                 ar1_fields = TRUE, include_spatial = FALSE, spatial_trend = FALSE, spatial_only = FALSE)
#> Warning: The model may not have converged: non-positive-definite Hessian matrix.

# Check model
print(m_cuma)
#> Warning in sqrt(diag(object$cov.fixed)): NaNs produced
#> Warning in sqrt(diag(object$cov.fixed)): NaNs produced

#> Warning in sqrt(diag(object$cov.fixed)): NaNs produced

#> Warning in sqrt(diag(object$cov.fixed)): NaNs produced

#> Warning in sqrt(diag(object$cov.fixed)): NaNs produced

#> Warning in sqrt(diag(object$cov.fixed)): NaNs produced
#> Spatiotemporal model fit by ML ['sdmTMB']
#> Formula: biomass ~ year_f - 1 + depth_scaled + depth_scaled_sq
#> Time column: "year"
#> SPDE: cuma_spde
#> Data: cuma
#> Family: tweedie(link = 'log')
#>                 coef.est coef.se
#> year_f2015         -9.25    1.67
#> year_f2016         -8.58    1.68
#> year_f2017         -8.49    1.67
#> year_f2018         -8.52    1.67
#> year_f2019         -7.69    1.66
#> depth_scaled        1.94    0.53
#> depth_scaled_sq    -1.04    0.24
#> 
#> Matern range parameter: 0.52
#> Dispersion parameter: 0.61
#> Spatiotemporal SD (sigma_E): 6.72
#> Spatiotemporal AR1 correlation (rho): 1.00
#> ML criterion at convergence: 162.024

# Check AR1 parameter
tidy(m_cuma, effects = "ran_pars", conf.int = TRUE) %>% filter(term == "rho")
#> Warning in sqrt(diag(object$cov.fixed)): NaNs produced

#> Warning in sqrt(diag(object$cov.fixed)): NaNs produced

#> Warning in sqrt(diag(object$cov.fixed)): NaNs produced

#> Warning in sqrt(diag(object$cov.fixed)): NaNs produced
#> filter: removed 3 rows (75%), one row remaining
#>   term estimate std.error conf.low conf.high
#> 1  rho        1       NaN      NaN       NaN
# Quite large

# Check residuals
d2_cuma <- cuma
d2_cuma$residuals <- residuals(m_cuma)

# OK
qqnorm(d2_cuma$residuals); abline(a = 0, b = 1)


# Lastly, plot residuals on a map
ggplot(d2_cuma, aes(lon, lat, colour = residuals)) +
  geom_point(size = 0.5) +
  facet_wrap(~year) +
  scale_color_gradient2()


# Plot the marginal effect of depth:
nd_cuma <- data.frame(depth_scaled = seq(min(cuma$depth_scaled), max(cuma$depth_scaled), length.out = 100))
nd_cuma$depth_scaled_sq <- nd_cuma$depth_scaled*nd_cuma$depth_scaled
nd_cuma$year <- as.integer(max(cuma$year))
nd_cuma$year_f <- factor(nd_cuma$year)

p_cuma <- predict(m_cuma, newdata = nd_cuma, se_fit = TRUE, re_form = NA)

ggplot(p_cuma, aes(depth_scaled, exp(est),
  ymin = exp(est - 1.96 * est_se), ymax = exp(est + 1.96 * est_se))) +
  geom_line() + geom_ribbon(alpha = 0.4)

Make prediction onto the new grid

# Predict using the Tweedie model
cuma_pred_grid <- predict(m_cuma, newdata = pred_grid_cuma)

# Plot predictions!
cuma_pred_grid %>% 
  ggplot(., aes(lon, lat, fill = est)) +
  geom_raster() +
  facet_wrap(~year, ncol = 2) +
  scale_fill_viridis(option = "magma", na.value = "transparent") + 
  geom_sf(data = world, inherit.aes = F, size = 0.2) +
  coord_sf(xlim = c(xmin, xmax), ylim = c(ymin, ymax)) +
  ggtitle("Prediction (random + fixed)")
#> Warning: Raster pixels are placed at uneven horizontal intervals and will be
#> shifted. Consider using geom_tile() instead.
#> Warning: Raster pixels are placed at uneven vertical intervals and will be
#> shifted. Consider using geom_tile() instead.


# Plot spatiotemporal random effects
cuma_pred_grid %>% 
  ggplot(., aes(lon, lat, fill = epsilon_st)) +
  geom_raster() +
  facet_wrap(~year, ncol = 2) +
  scale_fill_gradient2(na.value = "transparent") +
  geom_sf(data = world, inherit.aes = F, size = 0.2) +
  coord_sf(xlim = c(xmin, xmax), ylim = c(ymin, ymax)) + 
  ggtitle("Spatiotemporal random effect")
#> Warning: Raster pixels are placed at uneven horizontal intervals and will be
#> shifted. Consider using geom_tile() instead.

#> Warning: Raster pixels are placed at uneven vertical intervals and will be
#> shifted. Consider using geom_tile() instead.

Now calculate the index by year and sub area

# From these models, predict annual biomass for each sub area
# sort(unique(cuma$sub_area))

# Sub-area 25
cuma_preds25 <- predict(m_cuma, filter(pred_grid_cuma, sub_area == 2),
                        return_tmb_object = TRUE) # Predict using the grid for subarea

cuma_ind25 <- get_index(cuma_preds25, bias_correct = T) # Get the index (sum of all cells in the pred grid)
#> Warning: The model may not have converged: non-positive-definite Hessian matrix.

# The index is now the sum of grid cells in g/area of a grid cell
# First multiply by the area of a grid cell to get raw g
# Then divide that by the total area
ncells25 <- pred_grid_cuma %>% filter(year == 2015 & sub_area == 2) %>% nrow()

# This is the same as simply dividing by ncells2
cuma_ind25 <- cuma_ind25 %>% mutate(est = est / ncells25,
                                    lwr = lwr / ncells25,
                                    upr = upr / ncells25,
                                    sub_area = 2)

# Sub-area 27
cuma_preds27 <- predict(m_cuma, filter(pred_grid_cuma, sub_area == 4),
                        return_tmb_object = TRUE) # Predict using the grid for subarea

cuma_ind27 <- get_index(cuma_preds27, bias_correct = T) # Get the index (sum of all cells in the pred grid)
#> Warning: The model may not have converged: non-positive-definite Hessian matrix.

# The index is now the sum of grid cells in g/area of a grid cell
# First multiply by the area of a grid cell to get raw g
# Then divide that by the total area
ncells27 <- pred_grid_cuma %>% filter(year == 2015 & sub_area == 4) %>% nrow()

# This is the same as simply dividing by ncells2
cuma_ind27 <- cuma_ind27 %>% mutate(est = est / ncells27,
                                    lwr = lwr / ncells27,
                                    upr = upr / ncells27,
                                    sub_area = 4)
 
# Sub-area 28
cuma_preds28 <- predict(m_cuma, filter(pred_grid_cuma, sub_area == 5),
                        return_tmb_object = TRUE) # Predict using the grid for subarea

cuma_ind28 <- get_index(cuma_preds28, bias_correct = T) # Get the index (sum of all cells in the pred grid)
#> Warning: The model may not have converged: non-positive-definite Hessian matrix.

# The index is now the sum of grid cells in g/area of a grid cell
# First multiply by the area of a grid cell to get raw g
# Then divide that by the total area
ncells28 <- pred_grid_cuma %>% filter(year == 2015 & sub_area == 5) %>% nrow()

# This is the same as simply dividing by ncells2
cuma_ind28 <- cuma_ind28 %>% mutate(est = est / ncells28,
                                    lwr = lwr / ncells28,
                                    upr = upr / ncells28,
                                    sub_area = 5)

# Merge prediction-data
cuma_preds <- bind_rows(cuma_ind25, cuma_ind27, cuma_ind28)

# Compare to data quickly
p <- cuma_preds %>%
  dplyr::select(year, est, upr, lwr, sub_area) %>%
  mutate(source = "pred") %>% 
  arrange(year, sub_area)

d <- cuma %>%
  filter(!sub_area == 24) %>% 
  group_by(year, sub_area) %>% 
  summarise(est = mean(biomass)) %>% 
  mutate(source = "data") %>% 
  arrange(year, sub_area)
  
ggplot(bind_rows(p, d), aes(year, est, color = source)) +
  geom_point(size = 4) + 
  geom_line(size = 1) + 
  facet_wrap(~ sub_area, scales = "free", ncol = 2) +
  geom_ribbon(aes(ymin = lwr, ymax = upr), alpha = 0.4) + 
  theme_classic(base_size = 15) + 
  ggtitle("Cumacea")
#> Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning -
#> Inf
#> Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning -
#> Inf

#> Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning -
#> Inf

Merge data to a full index

sad_preds <- sad_preds %>%
  dplyr::select(year, sub_area, est, lwr, upr) %>% 
  mutate(species_group = "Saduria entomon", source = "model")
#> mutate: new variable 'species_group' (character) with one unique value and 0% NA
#>         new variable 'source' (character) with one unique value and 0% NA

amp_preds <- amp_preds %>%
  dplyr::select(year, sub_area, est, lwr, upr) %>% 
  mutate(species_group = "Amphipoda", source = "model")
#> mutate: new variable 'species_group' (character) with one unique value and 0% NA
#>         new variable 'source' (character) with one unique value and 0% NA

myt_preds <- myt_preds %>%
  dplyr::select(year, sub_area, est, lwr, upr) %>% 
  mutate(species_group = "Mytiloida", source = "model")
#> mutate: new variable 'species_group' (character) with one unique value and 0% NA
#>         new variable 'source' (character) with one unique value and 0% NA

lim_preds <- lim_preds %>%
  dplyr::select(year, sub_area, est, lwr, upr) %>% 
  mutate(species_group = "Limecola balthica", source = "model")
#> mutate: new variable 'species_group' (character) with one unique value and 0% NA
#>         new variable 'source' (character) with one unique value and 0% NA

pol_preds <- pol_preds %>%
  dplyr::select(year, sub_area, est, lwr, upr) %>% 
  mutate(species_group = "Polychaeta", source = "model")
#> mutate: new variable 'species_group' (character) with one unique value and 0% NA
#>         new variable 'source' (character) with one unique value and 0% NA

cuma_preds <- cuma_preds %>%
  dplyr::select(year, sub_area, est, lwr, upr) %>% 
  mutate(species_group = "Cumacea", source = "model")
#> mutate: new variable 'species_group' (character) with one unique value and 0% NA
#>         new variable 'source' (character) with one unique value and 0% NA

pred_dat <- bind_rows(sad_preds, amp_preds, myt_preds, lim_preds, pol_preds, cuma_preds)

# Now add in the data
dat <- bind_rows(sad, amp, myt, lim, pol, cuma) %>% dplyr::select(year, sub_area, species_group, biomass)

avg_dat <- dat %>% 
  group_by(year, sub_area, species_group) %>% 
  summarize(est = mean(biomass)) %>% 
  mutate(source = "data")
#> group_by: 3 grouping variables (year, sub_area, species_group)
#> summarize: now 90 rows and 4 columns, 2 group variables remaining (year, sub_area)
#> mutate (grouped): new variable 'source' (character) with one unique value and 0% NA

full_dat <- bind_rows(avg_dat, pred_dat)

# Plot
ggplot(full_dat, aes(year, est, color = factor(sub_area), shape = source, linetype = source)) +
  geom_point(size = 3) + 
  geom_line(size = 0.5) + 
  scale_color_brewer(palette = "Dark2", name = "sub-area") +
  facet_wrap(~ species_group, scales = "free", ncol = 3) +
  theme_classic(base_size = 15) + 
  theme(legend.position = "bottom")

ggsave("figures/index_data.png", width = 9, height = 9, dpi = 600)

# Now with ribbons and without data
full_dat %>% 
  filter(source == "model") %>% 
  ggplot(., aes(year, est, color = factor(sub_area), fill = factor(sub_area))) +
  geom_point(size = 4) + 
  geom_line(size = 1) + 
  scale_color_brewer(palette = "Dark2", name = "sub-area") +
  scale_fill_brewer(palette = "Dark2", name = "sub-area") +
  facet_wrap(~ species_group, scales = "free", ncol = 3) +
  geom_ribbon(aes(ymin = lwr, ymax = upr), alpha = 0.2, color = NA) + 
  theme_classic(base_size = 15) + 
  theme(legend.position = "bottom")
#> filter (grouped): removed 90 rows (50%), 90 rows remaining


ggsave("figures/index_ci.png", width = 9, height = 9, dpi = 600)

write.csv(full_dat, "output/benthic_indicies.csv")